Add support for hardware half<->float conversions

author Daniel Sabo <DanielSabo@gmail.com>

Sun, 27 Dec 2015 15:29:55 +0000 (07:29 -0800)

committer Daniel Sabo <DanielSabo@gmail.com>

Fri, 1 Jan 2016 20:36:48 +0000 (12:36 -0800)
author Daniel Sabo <DanielSabo@gmail.com>
Sun, 27 Dec 2015 15:29:55 +0000 (07:29 -0800)
committer Daniel Sabo <DanielSabo@gmail.com>
Fri, 1 Jan 2016 20:36:48 +0000 (12:36 -0800)
diff --git a/babl/babl-cpuaccel.c b/babl/babl-cpuaccel.c

index 4e1683e8702740f32d74414dfcee3e5282e64bfc..59fdcddd31e732d4c39a3cf2bf4ef0e573d28738 100644 (file)
--- a/babl/babl-cpuaccel.c
+++ b/babl/babl-cpuaccel.c
@@ -118,7 +118,8 @@ enum
    ARCH_X86_INTEL_FEATURE_SSSE3    = 1 << 9,
    ARCH_X86_INTEL_FEATURE_SSE4_1   = 1 << 19,
    ARCH_X86_INTEL_FEATURE_SSE4_2   = 1 << 20,
-  ARCH_X86_INTEL_FEATURE_AVX      = 1 << 28
+  ARCH_X86_INTEL_FEATURE_AVX      = 1 << 28,
+  ARCH_X86_INTEL_FEATURE_F16C     = 1 << 29,
  };
  
  #if !defined(ARCH_X86_64) && (defined(PIC) || defined(__PIC__))
@@ -244,6 +245,9 @@ arch_accel_intel (void)
  
      if (ecx & ARCH_X86_INTEL_FEATURE_SSE4_1)
        caps |= BABL_CPU_ACCEL_X86_SSE4_1;
+
+    if (ecx & ARCH_X86_INTEL_FEATURE_F16C)
+      caps |= BABL_CPU_ACCEL_X86_F16C;
  #endif /* USE_SSE */
    }
  #endif /* USE_MMX */
diff --git a/babl/babl-cpuaccel.h b/babl/babl-cpuaccel.h

index 57eb1184c94f3af1e2f9293aa6eda93bcfe52c8c..8040d730fe2cf6f24231c72a0fc22c6ac9d0613b 100644 (file)
--- a/babl/babl-cpuaccel.h
+++ b/babl/babl-cpuaccel.h
@@ -32,6 +32,9 @@ typedef enum
    BABL_CPU_ACCEL_X86_SSE3    = 0x02000000,
    BABL_CPU_ACCEL_X86_SSSE3   = 0x00800000,
    BABL_CPU_ACCEL_X86_SSE4_1  = 0x00400000,
+  /* BABL_CPU_ACCEL_X86_SSE4_2  = 0x00200000, */
+  /* BABL_CPU_ACCEL_X86_AVX     = 0x00080000, */
+  BABL_CPU_ACCEL_X86_F16C    = 0x00040000,
  
    /* powerpc accelerations */
    BABL_CPU_ACCEL_PPC_ALTIVEC = 0x04000000,
diff --git a/configure.ac b/configure.ac

index f09c7ac10bd333055b9624c9cf590d5690a2ae82..28e9af0b89e616a90ee83327e6cce6b02d0aa305 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -303,6 +303,10 @@ AC_ARG_ENABLE(sse4_1,
    [  --enable-sse4_1            enable SSE4_1 support (default=auto)],,
    enable_sse4_1=$enable_sse)
  
+AC_ARG_ENABLE(f16c,
+  [  --enable-f16c            enable hardware half-float support (default=auto)],,
+  enable_f16c=$enable_sse)
+
  if test "x$enable_mmx" = xyes; then
    BABL_DETECT_CFLAGS(MMX_EXTRA_CFLAGS, '-mmmx')
    SSE_EXTRA_CFLAGS=
@@ -378,6 +382,24 @@ if test "x$enable_mmx" = xyes; then
          fi
        fi
  
+      if test "x$enable_f16c" = xyes; then
+        BABL_DETECT_CFLAGS(f16c_flag, '-mf16c')
+        SSE4_1_EXTRA_CFLAGS="$SSE_EXTRA_CFLAGS $f16c_flag"
+
+        AC_MSG_CHECKING(whether we can compile half-floating point code)
+
+        CFLAGS="$CFLAGS $sse_flag $f16c_flag"
+
+        AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],[_mm_cvtph_ps ((__m128i)_mm_setzero_ps());])],
+          AC_DEFINE(USE_F16C, 1, [Define to 1 if f16c intrinsics are available.])
+          AC_MSG_RESULT(yes)
+        ,
+          enable_f16c=no
+          AC_MSG_RESULT(no)
+          AC_MSG_WARN([The compiler does not support f16c intrinsics.])
+        )
+      fi
+
      fi
    ,
      enable_mmx=no
diff --git a/extensions/Makefile.am b/extensions/Makefile.am

index cd7e89311cfa129bec8a8e18f1b2043f21ddc5ec..c06aa8fcad8608c5f79d9cd6c56c1c0f231e4411 100644 (file)
--- a/extensions/Makefile.am
+++ b/extensions/Makefile.am
@@ -32,6 +32,7 @@ ext_LTLIBRARIES = \
         sse2-int8.la    \
         sse2-int16.la   \
         sse4-int8.la    \
+       sse-half.la     \
         two-table.la    \
         ycbcr.la
  
@@ -50,6 +51,7 @@ sse2_float_la_SOURCES = sse2-float.c
  sse2_int8_la_SOURCES = sse2-int8.c
  sse2_int16_la_SOURCES = sse2-int16.c
  sse4_int8_la_SOURCES = sse4-int8.c
+sse_half_la_SOURCES = sse-half.c
  two_table_la_SOURCES = two-table.c two-table-tables.h
  ycbcr_la_SOURCES = ycbcr.c
  float_la_SOURCES = float.c
@@ -62,3 +64,4 @@ sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
  sse2_int8_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
  sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
  sse4_int8_la_CFLAGS = $(SSE4_1_EXTRA_CFLAGS)
+sse_half_la_CFLAGS = $(SSE4_1_EXTRA_CFLAGS) $(F16C_EXTRA_CFLAGS)
diff --git a/extensions/sse-half.c b/extensions/sse-half.c

new file mode 100644 (file)

index 0000000..ca57ceb
--- /dev/null
+++ b/extensions/sse-half.c
@@ -0,0 +1,270 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2015 Daniel Sabo
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(USE_SSE4_1) && defined(USE_F16C)
+
+#include <immintrin.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+#include "extensions/util.h"
+
+static inline long
+conv_yHalf_yF (const uint16_t *src, float *dst, long samples)
+{
+  const uint64_t *s_vec;
+  __v4sf         *d_vec;
+
+  long n = samples;
+
+  s_vec = (const uint64_t *)src;
+  d_vec = (__v4sf *)dst;
+
+  while (n >= 4)
+    {
+      __m128i in_val = _mm_insert_epi64((__m128i)_mm_setzero_ps(), *s_vec++, 0);
+      __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
+      _mm_storeu_ps((float *)d_vec++, out_val);
+      n -= 4;
+    }
+
+  src = (const uint16_t *)s_vec;
+  dst = (float *)d_vec;
+
+  while (n)
+    {
+      __m128i in_val = _mm_insert_epi16((__m128i)_mm_setzero_ps(), *src++, 0);
+      __v4sf out_val = (__v4sf)_mm_cvtph_ps(in_val);
+      _mm_store_ss(dst++, out_val);
+      n -= 1;
+    }
+
+  return samples;
+}
+
+static long
+conv_yaHalf_yaF (const uint16_t *src, float *dst, long samples)
+{
+  return conv_yHalf_yF (src, dst, samples * 2) / 2;
+}
+
+static long
+conv_rgbHalf_rgbF (const uint16_t *src, float *dst, long samples)
+{
+  return conv_yHalf_yF (src, dst, samples * 3) / 3;
+}
+
+static long
+conv_rgbaHalf_rgbaF (const uint16_t *src, float *dst, long samples)
+{
+  return conv_yHalf_yF (src, dst, samples * 4) / 4;
+}
+
+static inline long
+conv_yF_yHalf (const float *src, uint16_t *dst, long samples)
+{
+  const __v4sf *s_vec;
+  uint64_t     *d_vec;
+
+  long n = samples;
+
+  s_vec = (const __v4sf *)src;
+  d_vec = (uint64_t *)dst;
+
+  while (n >= 4)
+    {
+      __m128 in_val = _mm_loadu_ps((float *)s_vec++);
+      __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+      _mm_storel_epi64((__m128i *)d_vec++, out_val);
+      n -= 4;
+    }
+
+  src = (const float *)s_vec;
+  dst = (uint16_t *)d_vec;
+
+  while (n)
+    {
+      __m128 in_val = _mm_load_ss(src++);
+      __m128i out_val = _mm_cvtps_ph(in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+      *dst++ = _mm_extract_epi16(out_val, 0);
+      n -= 1;
+    }
+
+  return samples;
+}
+
+static long
+conv_yaF_yaHalf (const float *src, uint16_t *dst, long samples)
+{
+  return conv_yF_yHalf (src, dst, samples * 2) / 2;
+}
+
+static long
+conv_rgbF_rgbHalf (const float *src, uint16_t *dst, long samples)
+{
+  return conv_yF_yHalf (src, dst, samples * 3) / 3;
+}
+
+static long
+conv_rgbaF_rgbaHalf (const float *src, uint16_t *dst, long samples)
+{
+  return conv_yF_yHalf (src, dst, samples * 4) / 4;
+}
+
+#endif /* defined(USE_SSE4_1) && defined(USE_F16C) */
+
+int init (void);
+
+int
+init (void)
+{
+#if defined(USE_SSE4_1) && defined(USE_F16C)
+  const Babl *rgbaF_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbaHalf_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("half"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbaF_gamma = babl_format_new (
+    babl_model ("R'G'B'A"),
+    babl_type ("float"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbaHalf_gamma = babl_format_new (
+    babl_model ("R'G'B'A"),
+    babl_type ("half"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbF_linear = babl_format_new (
+    babl_model ("RGB"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    NULL);
+  const Babl *rgbHalf_linear = babl_format_new (
+    babl_model ("RGB"),
+    babl_type ("half"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    NULL);
+  const Babl *rgbF_gamma = babl_format_new (
+    babl_model ("R'G'B'"),
+    babl_type ("float"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    NULL);
+  const Babl *rgbHalf_gamma = babl_format_new (
+    babl_model ("R'G'B'"),
+    babl_type ("half"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    NULL);
+  const Babl *yaF_linear = babl_format_new (
+    babl_model ("YA"),
+    babl_type ("float"),
+    babl_component ("Y"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yaHalf_linear = babl_format_new (
+    babl_model ("YA"),
+    babl_type ("half"),
+    babl_component ("Y"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yaF_gamma = babl_format_new (
+    babl_model ("Y'A"),
+    babl_type ("float"),
+    babl_component ("Y'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yaHalf_gamma = babl_format_new (
+    babl_model ("Y'A"),
+    babl_type ("half"),
+    babl_component ("Y'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yF_linear = babl_format_new (
+    babl_model ("Y"),
+    babl_type ("float"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *yHalf_linear = babl_format_new (
+    babl_model ("Y"),
+    babl_type ("half"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *yF_gamma = babl_format_new (
+    babl_model ("Y'"),
+    babl_type ("float"),
+    babl_component ("Y'"),
+    NULL);
+  const Babl *yHalf_gamma = babl_format_new (
+    babl_model ("Y'"),
+    babl_type ("half"),
+    babl_component ("Y'"),
+    NULL);
+
+#define CONV(src, dst) \
+{ \
+  babl_conversion_new (src ## _linear, dst ## _linear, "linear", conv_ ## src ## _ ## dst, NULL); \
+  babl_conversion_new (src ## _gamma, dst ## _gamma, "linear", conv_ ## src ## _ ## dst, NULL); \
+}
+
+  if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE4_1) &&
+      (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_F16C))
+    {
+      CONV(rgbaHalf, rgbaF);
+      CONV(rgbHalf,  rgbF);
+      CONV(yaHalf,   yaF);
+      CONV(yHalf,    yF);
+      CONV(rgbaF,    rgbaHalf);
+      CONV(rgbF,     rgbHalf);
+      CONV(yaF,      yaHalf);
+      CONV(yF,       yHalf);
+    }
+
+#endif /* defined(USE_SSE4_1) && defined(USE_F16C) */
+
+  return 0;
+}
+
author	Daniel Sabo <DanielSabo@gmail.com>
	Sun, 27 Dec 2015 15:29:55 +0000 (07:29 -0800)
committer	Daniel Sabo <DanielSabo@gmail.com>
	Fri, 1 Jan 2016 20:36:48 +0000 (12:36 -0800)
babl/babl-cpuaccel.c		patch \| blob \| history
babl/babl-cpuaccel.h		patch \| blob \| history
configure.ac		patch \| blob \| history
extensions/Makefile.am		patch \| blob \| history
extensions/sse-half.c	[new file with mode: 0644]	patch \| blob